from sklearn import tree
import math
import pandas as pd
import numpy as np
from sklearn import tree
import matplotlib as plt
# Load the UCI credit-card dataset; column 0 (the client ID) becomes the index.
data=pd.read_csv('UCI_Credit_Card.csv',index_col=[0])
#pd.set_option('display.max_rows', None)
# Show every column when printing DataFrames (needed for the describe()/info() dumps below).
pd.set_option('display.max_columns',None)
# Mapping from raw column names to human-readable English labels.
# NOTE(review): most keys (SeriousDlqin2yrs, RevolvingUtilizationOfUnsecuredLines,
# MonthlyIncome, ...) look like the "Give Me Some Credit" dataset, not
# UCI_Credit_Card loaded above — confirm which dataset this mapping targets.
# Currently unused: the rename call below is commented out.
column={'ID':'User ID',
        'SeriousDlqin2yrs':'Good and Bad Customers',
        'RevolvingUtilizationOfUnsecuredLines':'Ratio of available credits',
        'age':'Age',
        'NumberOfTimes':'Number Of Times',
        'NumberOfTime30-59DaysPastDueNotWorse':'Number Of Time 30-59 Days Past Due ',
        'DebtRatio':'DebtRatio',
        'MonthlyIncome':'Monthly Income',
        'NumberOfOpenCreditLinesAndLoans':'Number Of Credits',
        'NumberOfTimes90DaysLate':'Number of 90 days past due loans',
        'NumberRealEstate':'Number of loans',
        'NumberRealEstateLoansOrLines':'Fixed Asset Loan Volume',
        'NumberOfTime60-89DaysPastDueNotWorse':'Number Of Time 60-89 Days Past Due',
        'NumberOfDependents':'Number Of family members',
        'DefaultRate':'Default rate',
        'GoodDebt':'Number of credits never defaulted',
        'RemainingIncome':'Monthly Remaining Income',
        'EstateLoan':'Fixed Asset Loan Ratio',
        'Comsumption Loan': 'Level of debt per capita',
        'AverageIncome':'Average income per household member',
        'AverageDebtLevel': 'Number of debt per household member',
        'CreaditUsedRatio':'Personal Credit Used Ratio',
        '30-59DaysRatio': 'Number of 30-59 day past due loans as a proportion of total past due',
        '60-89DaysRatio':  'Number of 60-89 day past due loans as a proportion of total past due',
        'Debt1': 'Whether over-indebted',
        'CreaditRatio':'Whether credit limit is over utilized'}
# data.rename(columns=column,inplace=True)
rs = 18  # seed for the stratified split below; seeds 3, 7 and 18 performed well (18 best)
# 4 10 17
# print(data.describe())
# print(data.info())
data.drop_duplicates(inplace=True)    # remove duplicate rows

print(data.describe())



# Feature processing

# Three-sigma rule: drop rows whose credit limit exceeds mean + 3*std.
LIMIT_BAL_ =data['LIMIT_BAL'].mean()+3*data['LIMIT_BAL'].std()
data = data[data['LIMIT_BAL'] <= LIMIT_BAL_]

# Drop the SEX and MARRIAGE columns (excluded from the feature set —
# presumably to avoid demographic features; confirm with the author).
data = data.drop(['SEX'], axis=1)
data = data.drop(['MARRIAGE'], axis=1)

# Dataset summary after cleaning
print(data.info())
print(data.describe())
# print(data['MonthlyIncome'].isnull())
# print(data['NumberOfDependents'].isnull())

def cal_Chi2(df):
    """Compute the chi-square statistic of a contingency table.

    df: DataFrame of observed counts (rows = groups, columns = classes).
    Returns the sum over all cells of (observed - expected)**2 / expected,
    where expected = row_total * col_total / grand_total.

    Vectorized with NumPy instead of the original cell-by-cell Python loop.
    Note: a zero row or column marginal still yields a division by zero,
    exactly as the loop version did; the caller (ChiMerge) pre-merges
    all-zero rows to avoid that.
    """
    observed = df.values.astype(float)
    total = observed.sum()
    # Outer product of the marginals gives every cell's expected count at once.
    expected = np.outer(observed.sum(axis=1), observed.sum(axis=0)) / total
    return float(((observed - expected) ** 2 / expected).sum())


def line_merge(df, i, j):
    """Fold row j into row i of a regroup table and return the table without row j.

    Columns 1 and 2 (positive / negative counts) of row j are added into
    row i, column 0 (the bin's representative value) is taken over from
    row j, and a new frame omitting positional row j is returned.
    """
    upper_value = df.iloc[j, 0]
    df.iloc[i, 1] = df.iloc[i, 1] + df.iloc[j, 1]
    df.iloc[i, 2] = df.iloc[i, 2] + df.iloc[j, 2]
    df.iloc[i, 0] = upper_value
    # Keep every positional row except j.
    survivors = [row for row in range(df.shape[0]) if row != j]
    return df.iloc[survivors, :]


# Chi-square binning with configurable confidence level and maximum bin count.
# Stop condition: number of intervals <= bin AND every adjacent chi-square >= confidenceVal.
def ChiMerge(df, variable, flag='default.payment.next.month', confidenceVal=9.542, bin=22):  # bin around 8 avoids empty bins; bin=20 worked best
    '''
    df:pass in a dataframe containing only one variable to be chi-squared with positive and negative sample identifiers (1 for positive samples, 0 for negative samples)
    variable: name of the variable to be chi-square binned (string)
    flag: name of the positive and negative sample identifiers (string)
    confidenceVal: confidence level (default is 95% without sampling)
    bin: maximum number of bins
    Returns the sorted list of bin boundaries (first = variable min, last = variable max + 0.1).
    '''
    x1 = df[variable].values  # fixed: originally read the module-level `data`, ignoring the df argument
    boundary = []
    # Build the regroup table: one row per distinct value with positive/negative counts.
    regroup = df.groupby([variable])[flag].agg(["size", "sum"])
    regroup.columns = ['total_num', 'positive_class']
    regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']  # negative-sample count per value
    regroup = regroup.drop('total_num', axis=1).reset_index()

    print('Data reading completed, initial calculations being processed.')

    # Pre-merge adjacent pairs whose combined positive or negative column is all
    # zero; otherwise cal_Chi2's expected counts would divide by zero.
    i = 0
    while (i <= regroup.shape[0] - 2):
        if sum(regroup.iloc[[i, i + 1], [1, 2]].sum() == 0) > 0:
            regroup = line_merge(regroup, i, i + 1)
            i = i - 1
        i = i + 1

    # Chi-square statistic for every adjacent pair of rows.
    chi_ls = []
    for i in np.arange(regroup.shape[0] - 1):
        chi = cal_Chi2(regroup.iloc[[i, i + 1], [1, 2]])
        chi_ls.append(chi)

    print('Initial calculations completed，chi-square bining being processed.')

    # Repeatedly combine the two adjacent intervals with the smallest chi-square
    # value until both stop conditions hold.
    while True:
        if (len(chi_ls) <= (bin - 1) and min(chi_ls) >= confidenceVal):
            break

        min_ind = chi_ls.index(min(chi_ls))
        regroup = line_merge(regroup, min_ind, min_ind + 1)

        if (min_ind == regroup.shape[0] - 1):
            # Merged the last two rows: only the chi-square to the left changes.
            chi_ls[min_ind - 1] = cal_Chi2(regroup.iloc[[min_ind, min_ind - 1], [1, 2]])
            del chi_ls[min_ind]

        else:
            # Recompute the chi-square against both neighbours of the merged row.
            if min_ind > 0:  # fixed: min_ind == 0 previously clobbered chi_ls[-1] via negative indexing
                chi_ls[min_ind - 1] = cal_Chi2(regroup.iloc[[min_ind, min_ind - 1], [1, 2]])
            chi_ls[min_ind] = cal_Chi2(regroup.iloc[[min_ind, min_ind + 1], [1, 2]])

            del chi_ls[min_ind + 1]

    print('Chi-square bining completed,saving results')

    # Build a human-readable interval label for every remaining row:
    # '-inf~v0', 'v0~v1', ..., 'v(n-2)+'.
    list_temp = []
    for i in np.arange(regroup.shape[0]):
        if i == 0:
            x = '-inf' + '~' + str(regroup.iloc[i, 0])
        elif i == regroup.shape[0] - 1:
            x = str(regroup.iloc[i - 1, 0]) + '+'
        else:
            x = str(regroup.iloc[i - 1, 0]) + '~' + str(regroup.iloc[i, 0])
        list_temp.append(x)
    regroup[variable] = list_temp
    print(regroup)
    # Upper edge of every interval except the open-ended last one.
    for i in list(regroup[variable])[:-1]:
        boundary.append(float(i.split('~')[1]))  # fixed: float() instead of eval() on data-derived text

    min_x = x1.min()
    max_x = x1.max() + 0.1  # nudge so the true maximum falls inside the last bin
    boundary = [min_x] + boundary + [max_x]
    boundary = list(set(boundary))  # drop duplicate edges
    boundary.sort()
    print(boundary)
    return boundary


# Calculate IV
def feature_woe_iv(x: pd.Series, y: pd.Series, column, data) -> pd.DataFrame:
    """Chi-square-bin one feature, compute its WOE/IV, and return the
    (possibly column-reduced) dataframe.

    x: the feature column (same values as data[column])
    y: the binary target (1 = default / bad, 0 = good)
    column: name of the feature being processed
    data: the full working dataframe; its `column` values are REPLACED in
          place with ordinal bin labels, and the column is dropped entirely
          when the summed IV is <= -0.05.

    NOTE(review): with finite WOE terms, IV = sum((good% - bad%) * woe) is
    non-negative, so the `<= -0.05` drop threshold can only fire when a bin
    has zero good or bad samples (woe = +/-inf) — confirm whether a positive
    low-IV threshold (e.g. 0.02) was intended instead.
    """

    # x = x.fillna(nan)
    print(column)
    # Bin edges from chi-square merging over the whole dataframe.
    boundary = ChiMerge(df=data,variable=column)
    df = pd.concat([x, y], axis=1)
    df.columns = ['x', 'y']
    # Overwrite the raw feature with its bin index (0 .. n_bins-1).
    data.loc[:, column] = pd.cut(data.loc[:, column], bins=boundary, labels=[i for i in range(len(boundary) - 1)],include_lowest=True)
    # NOTE(review): this cut uses right=False while the one above uses the
    # default right=True with include_lowest — the two treat interval edges
    # differently; confirm the asymmetry is intended.
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)

    # Per-bin counts of good (y==0), bad (y==1) and total samples.
    grouped = df.groupby('bins')['y']
    result_df = grouped.agg([('good', lambda y: (y == 0).sum()),
                             ('bad', lambda y: (y == 1).sum()),
                             ('total', 'count')])

    result_df['good_pct'] = result_df['good'] / result_df['good'].sum()   # Percentage of customers not in default
    result_df['bad_pct'] = result_df['bad'] / result_df['bad'].sum()   # Percentage of customers in default
    result_df['total_pct'] = result_df['total'] / result_df['total'].sum()

    result_df['bad_rate'] = result_df['bad'] / result_df['total']  # Default rate

    result_df['woe'] = np.log(result_df['good_pct'] / result_df['bad_pct'])  # WOE
    result_df['iv'] = (result_df['good_pct'] - result_df['bad_pct']) * result_df['woe']  # IV
    # Drop the feature when the total IV falls at or below the threshold.
    if result_df['iv'].sum() <= -0.05:
        data = data.drop([column], axis=1)
    print(column + f" IV = {result_df['iv'].sum()}")

    return data


# Move the target column to position 0 so data.iloc[:, 0] below is the label.
data.insert(0, 'default.payment.next.month', data.pop('default.payment.next.month'))

for column in data.iloc[:,1:].columns: #calculate IV
    # feature_woe_iv bins the column in place and may drop it; it returns the
    # updated dataframe, so rebind `data` every iteration.
    data = feature_woe_iv(x=data[column], y=data['default.payment.next.month'],column =column ,data=data)
data = data.reset_index(drop=True)
# data.to_csv('cs-training1.csv', index=1)


# Dataset segmentation

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=rs) # 0.2 0.3
# Sampling according to mnist["target"]
# NOTE(review): with n_splits=10 only the LAST split survives this loop —
# confirm whether n_splits=1 (or aggregating all splits) was intended.
for train_index, test_index in split.split(data.iloc[:, 1:], data.iloc[:, 0]): # split_split(X,y)
    user_train = data.iloc[train_index]
    user_train_target = user_train['default.payment.next.month']
    user_test = data.iloc[test_index]
    user_test_target = user_test['default.payment.next.month']
# Persist the last train/test split (row index written as the first column).
user_train.to_csv('UCI_Credit_Card_train_chi.csv', index=1)
user_test.to_csv('UCI_Credit_Card_test_chi.csv', index=1)